

/*------------------------------------------------------------------------------
	1. Housekeeping
------------------------------------------------------------------------------*/
	*Run housekeeping code
		qui do ".../replication_package/housekeeping.do"

	*Write log
		log using "$log_loc/figure_6.log", text replace

/*------------------------------------------------------------------------------
	2. Clean Crosswalk
------------------------------------------------------------------------------*/

	*Loop through years: 
		local year = 2018

			*Import zip_code-tract crosswalk
			import excel using "${data}/Zip-Census Tract/ZIP_TRACT_12`year'.xlsx", clear first

				*Make variables lower case
					ren *, lower
	
				*Keep variables needed for the crosswalk
					keep tract tot_ratio zip
	
				*Rename zip code
					ren zip zip_code

				*Drop tract-zip combinations where tract does not correspond to any zip code
					drop if tot_ratio == 0
	
				*Check id
					isid tract zip_code
	
			*Save dataset 
			tempfile zip_cross_`year'
			save `zip_cross_`year'', replace

/*------------------------------------------------------------------------------
	3. Estimate share of (kids age_numd 0-14)/(kids age_numd 0-17)
------------------------------------------------------------------------------*/

			import excel using "$data/Census Count/nc-est2019-syasexn.xlsx", clear cellra(A6:M23)
				keep A K L M
				ren (A K L M) (age_num census_2017 census_2018 census_2019)

			tempfile census_data
			save `census_data', replace

			import excel using "$data/Census Count/nc-est2022-syasexn.xlsx", clear cellra(A6:E23)
				keep A C D
				ren (A C D ) (age_num census_2020 census_2021)

				merge 1:1 age_num using `census_data'
					assert _merge == 3
					drop _merge

				replace age_num = substr(age_num, 2, .)
					destring age_num, replace
					sort age_num
	
				*Make data long (i.e., by age_num and year instead of just by age_num)
					reshape long census_, i(age_num) j(year)
					ren census_ census_ct
					isid year age_num

				*Keep if year is the one to be analyzed
					keep if year == `year'

				*Estimate the share of children under 14
					qui sum if age_num <= 14,d
					local ct_14_and_under = r(sum)

					qui sum if age_num <= 17, d
					local ct_17_and_under = r(sum)

					local share_14_and_under = `ct_14_and_under'/`ct_17_and_under'
					di "For `year', there are `ct_14_and_under' children 14 and under and `ct_17_and_under' children 17 and under"
					di "This corresponds to a share of `share_14_and_under' children being 14 and under"

/*------------------------------------------------------------------------------
	4. Prepare Kids Per Tract Dataset
------------------------------------------------------------------------------*/		
		*Import Household Age Data
		import delimited using "$data/B09001/ACSDT5Y`year'.B09001-Data.csv", clear rowrange(2) varn(2)

			*Keep key variables
				keep geography geographicareaname estimatetotal estimatetotalinhouseholds estimatetotalinhouseholdsunder3y estimatetotalinhouseholds3and4ye estimatetotalinhouseholds5years estimatetotalinhouseholds6to8yea estimatetotalinhouseholds9to11ye estimatetotalinhouseholds12to14y estimatetotalinhouseholds15to17y

			*Rename variables
				ren (geography geographicareaname estimatetotal estimatetotalinhouseholds estimatetotalinhouseholdsunder3y estimatetotalinhouseholds3and4ye estimatetotalinhouseholds5years estimatetotalinhouseholds6to8yea estimatetotalinhouseholds9to11ye estimatetotalinhouseholds12to14y estimatetotalinhouseholds15to17y) (tract tract_name total in_hh age_num_less_than_3 age_num_3_to_4 age_num_5 age_num_6_to_8 age_num_9_to_11 age_num_12_to_14 age_num_15_to_17)

			*Clean tract variable (last 11 values in tract variable)
				replace tract = substr(tract, -11, 11)

			*Get ct from 0-14
				gen age_num_0_14_hh = age_num_less_than_3 + age_num_3_to_4 + age_num_5 + age_num_6_to_8 + age_num_9_to_11 + age_num_12_to_14
		
				/*Estimate the proportion of non-household kids that are 14 by multiplying the number of kids (0-17) 
				by the proportion of kids in Census that are 0-14*/ 
					gen in_group_quarters = total - in_hh
					gen age_num_0_14_gq = in_group_quarters * `share_14_and_under'

				*Estimate the total number of kids that are 0-14
					gen age_num_0_14_acs = age_num_0_14_gq + age_num_0_14_hh

				*Keep key variables
					keep tract age_num_0_14_acs 

		tempfile kids
		save `kids', replace

/*------------------------------------------------------------------------------
	5. Prepare race-by-tract dataset
------------------------------------------------------------------------------*/
		*Import Race Data
		import delimited using "$data/DPO5/ACSDP5Y`year'.DP05-Data.csv", clear rowrange(2) varn(2)
	
			*Keep Key variables
				keep geography geographicareaname v143 estimatehispanicorlaidoandracet v155 v157 v159 v161 v163 v165 v167

			*Rename variables
				ren (geography geographicareaname v143 estimatehispanicorlaidoandracet v155 v157 v159 v161 v163 v165 v167) (tract tract_name hisp tot white black native_american asian pac_isl other_race multi_race)

			*Clean tract variable (last 11 values in tract variable)
				replace tract = substr(tract, -11, 11)

			*merge with kids dataset
				merge 1:1 tract using `kids'
					keep if _merge == 3
					drop _merge

				merge 1:m tract using `zip_cross_`year''
					keep if _merge == 3
					drop _merge

			/*Tot_ratio = ratio of addresses in tract-zip record to total number of addresses in tract
			To allocate data from tract to zip, multiple number of observations in tract by tot_ratio for each zip associated with tract */
			foreach i of varlist age_num_0_14_acs hisp tot white black {
				replace `i' = `i' * tot_ratio
			}

			collapse (sum) age_num_0_14_acs hisp tot white black, by(zip_code)

			*Generate pct race estimates by zip code
				gen pct_hisp = 100 * hisp/tot
				gen pct_white = 100 * white/tot
				gen pct_black = 100 * black/tot
	
		tempfile race_data_`year'
		save `race_data_`year'', replace

/*------------------------------------------------------------------------------
	6. Clean Main Datasets
------------------------------------------------------------------------------*/
		*Import Data
		use "$data/childrens`year'.dta", clear

			/*Restrict Sample*/
				*Keep observations in the 51 states (per the f1095 variable). Note that most of the dropped state values are missing.
					rename state_insurance state
					fmerge m:1 state using "$data/statelist.dta"
						assert _merge != 2 //Confirm that all 51 states are in dataset
						drop if _merge != 3

				*Keep if age_num is within the range
					keep if age_num >= 0 & age_num <= 16

				*Keep if kid died after end of year or is currently alive
					keep if `year'*10000+1231<death_date|death_date==0

				*Keep if months covered is possible
					keep if num_cov_ins > 0 & num_cov_ins <= 12

			/*Generate variables for figure*/	
				*Generate outcame variable on claiming
					gen onreturn = file_inc != .

				*Rename zip code variable
					ren (zipcode_ins) (zip_code)

				*Generate a count variable
					gen ct = 1

				*Collapse to the by-zipcode level		
					collapse (sum) ct onreturn, by(zip_code)

				*Adjust for claiming limits
					replace onreturn = onreturn * 1.0017029

				*Generate outcome (i.e., percent of children claimed)
					gen perc_return = onreturn/ct

				*Adjust outcome to be out of 100
					replace perc_return = perc_return*100
			
				*Merge with main dataset
					merge 1:1 zip_code using `race_data_`year''
						keep if _merge == 3

/*------------------------------------------------------------------------------
	7. Create Figure 4
------------------------------------------------------------------------------*/
			/*Create figure*/		
				binscatter perc_return pct_black [w = age_num_0_14_acs], nq(20) ///
					xtitle("% Black") ytitle("% Children Claimed") ///
					reportreg
		
					graph export "${output}/`year' Percent Black.png", replace as(png)


				binscatter perc_return pct_white [w = age_num_0_14_acs], nq(20) ///
					xtitle("% White") ytitle("% Children Claimed") ///
					reportreg

					graph export "${output}/`year' Percent White.png", replace as(png)


				binscatter perc_return pct_hisp [w = age_num_0_14_acs], nq(20) ///
					xtitle("% Hispanic") ytitle("% Children Claimed") ///
					reportreg

					graph export "${output}/`year' Percent Hispanic.png", replace as(png)	